package com.virjar.dungproxy.server.crawler.extractor;
import java.io.*;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
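/*
 * XML template parser: given an XML rule-description file, it extracts element information from a target HTML
 * document and returns JSON strings.
 *
 * Meaning of the XML attributes and elements:
 *
 * name: identifies the node within the current scope and is the default scope of the next level down. It is also
 * the key under which a successfully extracted value appears in the JSON output.
 *
 * xpath: describes the position or the value of the current node. If the node is not a final node, the xpath is a
 * node-location rule; if the node has no children, it describes a text value. XPath is evaluated by htmlcleaner,
 * so use only the XPath syntax that htmlcleaner supports.
 *
 * regex: text-extraction rule. It is a regular expression evaluated by Java's regex engine, so use Java regex
 * syntax. The rule uses group 1; to match the full text, wrap the expression in parentheses. The regex also serves
 * as the indicator of whether the data was extracted successfully.
 *
 * test: tests whether data is extracted successfully. If it is not, the properties enclosed by the test are not
 * recorded. If it is, the data is stored in the scope of the test and later tests with the same group are skipped.
 * A test fails when a property with require="true" (require defaults to false) cannot be extracted: its xpath
 * yields nothing or, when a regex is present, the regex does not match.
 *
 * group: used on test; names the group the test belongs to (default 0). Within one group, once one test has
 * succeeded, the remaining extraction alternatives are skipped.
 *
 * scope: the value storage space. Scopes form a tree. By default the scope of each matching unit is defined by the
 * name of its parent node, and extracted data is stored in that scope. The fetch node (the document root) is named
 * "page" by default, so a unit can force its scope to "page" to store its data globally for other modules to use.
 *
 * fetch: default false. If true, the values of this scope are collected and added to the return value of the
 * fetch method.
 *
 * check: if the check attribute is true, the record is only a marker for a contextual check and the extracted
 * value is not stored. check is designed to be combined with test and value; together the three make it possible
 * to store a value defined in the XML itself. Example: the HTML states gender as 男 (male) or 女 (female), but the
 * final result should record 帅哥 or 美女 instead:
 *
 *   <test group="3543654">
 *     <property name="genderCheck" check="true" xpath="xxx" regex=".*(男).*"/>
 *     <property name="gender" value="帅哥"/>
 *   </test>
 *
 * With this template, if the word 男 is extracted from the HTML, 帅哥 is written to the final result set.
 *
 * value: user-defined value. If a value is specified, no parsing is performed and the value is used directly.
 *
 * Notes: pages with a similar layout may still differ locally, so one template may not extract all data. To
 * improve completeness:
 * 1. Use test elements, each with a group. If a test fails, its extraction is ignored and the data is supplied by
 *    a later test; several tests may be defined for the same data.
 * 2. Use regular expressions. The XPath support of the API is incomplete, so xpath positioning may be inaccurate;
 *    a regex can validate the extracted content and thereby influence whether a test succeeds.
 * 3. Use several templates for one property. Extraction overwrites: when a later matching unit stores a property
 *    that an earlier unit already stored, the earlier value is overwritten (unless the earlier value is non-null
 *    and the later one is null, in which case the overwrite is skipped).
 */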
public class XmlModeFetcher {
private Document doc;
private HashMap<String, Pattern> pattenSpace;
private HashMap<String, ClassFetcher> classFetchers = new HashMap<String, ClassFetcher>();
private Logger logger = Logger.getLogger(XmlModeFetcher.class);
public XmlModeFetcher(File xmlfile) throws IOException, DocumentException {
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(xmlfile)));
StringBuilder sb = new StringBuilder();
String temp = "";
while ((temp = br.readLine()) != null) {
sb.append(temp);
}
br.close();
init(sb.toString());
}
private void init(String xml) throws DocumentException {
doc = DocumentHelper.parseText(xml);
pattenSpace = new HashMap<String, Pattern>();
Pattern regPattern = Pattern.compile("regex=['\"](.+?)['\"]");
Matcher regMatcher = regPattern.matcher(xml);
while (regMatcher.find()) {
String strReg = regMatcher.group(1);
strReg = strReg.replaceAll("&", "&");
strReg = strReg.replaceAll(""", "\"");
if (!pattenSpace.containsKey(strReg)) {
// System.out.println("compile regex:"+strReg);
logger.info("compile regex:" + strReg);
pattenSpace.put(strReg, Pattern.compile(strReg));
}
}
}
public XmlModeFetcher(String xml) throws DocumentException {
init(xml);
}
public List<String> fetch(String html) {
List<String> container = new ArrayList<String>();
TagNode tagNodeRoot = new HtmlCleaner().clean(html);
if (tagNodeRoot == null) {
return null;
}
NodeData nodDataRoot = new NodeData(null, "page");
innerFetch(container, doc.getRootElement(), nodDataRoot, nodDataRoot, tagNodeRoot, null);
return container;
}
private boolean setNodeDataValue(String name, String value, NodeData node, String scope,
HashMap<String, ScopeValue> failedValues) {
if (node == null) {
if (failedValues != null) {// set a failed value to failed containers
if (!failedValues.containsKey(name) || failedValues.get(name) == null) {
failedValues.put(name, new ScopeValue(value, scope));
}
}
return false;
}
if (node.getNodeName().equals(scope)) {
if (node.getProperties().containsKey(name) && node.getProperties().get(name) != null && value == null)
return true;
node.getProperties().put(name, value);
return true;
} else {
return setNodeDataValue(name, value, node.getParent(), scope, failedValues);
}
}
private String getNodeDataValue(String key, NodeData node) {
if (node == null)
return null;
if (node.getProperties().containsKey(key)) {
return node.getProperties().get(key);
} else {
return getNodeDataValue(key, node.getParent());
}
}
private boolean innerFetch(List<String> container, Element element, NodeData parent, NodeData root, TagNode tagnode,
HashMap<String, ScopeValue> failedValue) {
String xpath = element.attributeValue("xpath");
String name = element.attributeValue("name");
String scope = element.attributeValue("scope", parent.getNodeName());
String fetch = element.attributeValue("fetch", "false");
String regex = element.attributeValue("regex");
String require = element.attributeValue("require", "false");
String group = element.attributeValue("group", "0");
String check = element.attributeValue("check", "false");
String value = element.attributeValue("value");
String decoder = element.attributeValue("decoder");
String classfetcher = element.attributeValue("classfetcher");
if (classfetcher != null) {
TagNode classfetchnode = tagnode;
if (xpath != null) {
try {
Object Objs[] = tagnode.evaluateXPath(xpath);
for (Object obj : Objs) {
if (obj instanceof TagNode) {
classfetchnode = (TagNode) obj;
break;
}
}
} catch (XPatherException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
if (!this.classFetchers.containsKey(classfetcher)) {
try {
classFetchers.put(classfetcher, (ClassFetcher) XmlModeFetcher.class.getClassLoader()
.loadClass(classfetcher).getDeclaredConstructor().newInstance());
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
return false;
}
}
ClassFetcher fetcher = classFetchers.get(classfetcher);
String ret = fetcher.fetcher(classfetchnode, 0);
if (ret != null) {
if ("true".equals(check)) {
return true;
}
} else {
if ("true".equals(require)) {
return false;
}
}
setNodeDataValue(name, ret, parent, scope, failedValue);
return true;
}
// System.out.println("fetch:"+name);
if (regex != null) {
regex = regex.replaceAll("&", "&");
regex = regex.replaceAll(""", "\"");
}
if (element.getName().equals("fetch")) {// root
Iterator<Element> it = element.elementIterator();
while (it.hasNext()) {
Element childElement = it.next();
// NodeData nodedata = new NodeData(parent,name);
String rootscope = element.attributeValue("name", "page");
parent.nodeName = rootscope;
innerFetch(container, childElement, parent, root, tagnode, null);
}
} else if ("test".equals(element.getName())) {
// if group fetch is successful
if (parent.getGroupStates().containsKey(group) && parent.getGroupStates().get(group)) {
return true;
}
if (element.nodeCount() == 0) {
parent.getGroupStates().put(group, true);
return true;
}
Iterator<Element> it = element.elementIterator();
// new a temporary environment
List<String> tempcontainer = new ArrayList<String>();
NodeData temproot = new NodeData(null, scope);
boolean istestSuccessful = true;
HashMap<String, ScopeValue> failedValues = new HashMap<String, ScopeValue>();
while (it.hasNext()) {
Element childElement = it.next();
// NodeData nodedata = new NodeData(parent,name);
if (!innerFetch(tempcontainer, childElement, temproot, temproot, tagnode, failedValues)) {
istestSuccessful = false;
break;
}
}
if (istestSuccessful) {
parent.getGroupStates().put(group, true);
for (String jsonItem : tempcontainer) {
container.add(jsonItem);
}
Iterator<Entry<String, ScopeValue>> itfailedValues = failedValues.entrySet().iterator();
while (itfailedValues.hasNext()) {
Entry<String, ScopeValue> itemvalue = itfailedValues.next();
setNodeDataValue(itemvalue.getKey(), itemvalue.getValue().value, parent, itemvalue.getValue().scope,
null);
}
// Iterator<Entry<String, String>> tempit = temproot.properties.entrySet().iterator();
parent.properties.putAll(temproot.getProperties());
}
} else if (element.nodeCount() == 0) {
if (value != null) {
setNodeDataValue(name, value, parent, scope, failedValue);
return true;
}
if (xpath == null && name == null) {
return false;
}
if (xpath == null) {
String text = getNodeDataValue(name, parent);
if ("true".equals(require) && text == null)
return false;
setNodeDataValue(name, text, parent, scope, failedValue);
return true;
}
try {
Object textObjs[] = tagnode.evaluateXPath(xpath);
if (textObjs.length == 0) {
// logger.info("fetch failed for xpath:"+xpath+" parent
// xpath:"+element.getParent().attributeValue("xpath",""));
if ("true".equals(require) || "true".equals(check))
return false;
return true;
}
String text = textObjs[0].toString().trim();
// System.out.println("before regex:"+text);
// System.out.println("regex:"+regex);
if (regex != null) {
Pattern pattern = pattenSpace.get(regex);
if (pattern != null) {
Matcher matcher = pattern.matcher(text);
if (matcher.find()) {
text = matcher.group(1);
} else {
text = null;
if ("true".equals(require))
return false;
}
}
}
if (decoder != null)
text = Decoder.decode(text, decoder);
// System.out.println("end regex:"+text);
// store value
if ("true".equals(check)) {
return true;
}
setNodeDataValue(name, text, parent, scope, failedValue);
} catch (XPatherException e) {
// TODO Auto-generated catch block
// logger.info("fetch failed for xpath:"+xpath+" parent
// xpath:"+element.getParent().attributeValue("xpath",""));
logger.error(e, e);
e.printStackTrace();
return true;
}
return true;
} else {// combine
Object[] selfNode = null;
try {
// logger.info("get self node ,relative xpath is:"+xpath);
if (xpath.equals("./../") || xpath.equals("../")) {
selfNode = new Object[1];
selfNode[0] = tagnode.getParent();
} else {
selfNode = tagnode.evaluateXPath(xpath);
}
// logger.info("node number:"+selfNode.length);
JSONArray jsonArray = new JSONArray();
fetchFailed: for (Object obj : selfNode) {
if (obj instanceof TagNode) {
Iterator<Element> it = element.elementIterator();
NodeData nodedata = new NodeData(parent, name);
while (it.hasNext()) {
Element childElement = it.next();
if (!innerFetch(container, childElement, nodedata, root, (TagNode) obj, null))
continue fetchFailed;
}
// store json string value to container
Set<Entry<String, String>> dataset = nodedata.getProperties().entrySet();
Iterator<Entry<String, String>> entryit = dataset.iterator();
// JSONObject jsonObject = new JSONObject(nodedata.getProperties());//why ?????
JSONObject jsonObject = new JSONObject();
while (entryit.hasNext()) {
Entry<String, String> entry = entryit.next();
/*
* if(entry.getValue()==null){ entry.setValue(""); }
*/
jsonObject.put(entry.getKey(), entry.getValue());
}
jsonArray.add(jsonObject);
if (fetch.equals("true")) {
container.add(jsonObject.toJSONString());
}
}
}
String jsonArrayStr = jsonArray.toJSONString();
if (!"[{}]".equals(jsonArrayStr) && !"[]".equals(jsonArrayStr))
setNodeDataValue(name, jsonArrayStr, parent, scope, failedValue);
} catch (XPatherException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
return true;
}
}
return true;
}
private class NodeData {
private NodeData parent;
private String nodeName;
private HashMap<String, String> properties;
private HashMap<String, Boolean> groupStates;
public NodeData(NodeData parent, String nodeName) {
super();
this.parent = parent;
this.nodeName = nodeName;
properties = new HashMap<String, String>();
groupStates = new HashMap<String, Boolean>();
}
public NodeData getParent() {
return parent;
}
public HashMap<String, String> getProperties() {
return properties;
}
public String getNodeName() {
return nodeName;
}
public HashMap<String, Boolean> getGroupStates() {
return groupStates;
}
}
private class ScopeValue {
private String value;
private String scope;
public ScopeValue(String value, String scope) {
super();
this.value = value;
this.scope = scope;
}
}
}